Jędrzej Bogumił Lewandowski knowledge graph

Unicode normalization

Unicode supports equivalence, meaning that the same character (glyph) can be encoded in more than one way.

Example: Screenshot 2022-05-15 at 19.15.38.png Source: Unicode Annex #15: Unicode Normalization Forms

To solve this issue, normalization is performed. Normalization is the conversion of all characters in a given set to one of the normalization forms. The best source is Annex #15 of the Unicode specification.

There are four normalization forms: Screenshot 2022-05-15 at 19.18.24.png Source: Unicode Annex #15: Unicode Normalization Forms

Normalization gist

Simple Deno code to normalize paths in a dir:

import * as path from "https://deno.land/std@0.139.0/path/mod.ts";

// Directory whose entries (files and sub-directories) get NFC-normalized names.
const src = "path/to/dir/to/normalize"

// getAllFiles returns real (symlink-resolved) absolute paths, so the base used
// to relativize them must be resolved the same way; otherwise a symlinked
// `src` would produce relative paths that climb out through "..".
const srcReal = Deno.realPathSync(src)

const filepathsAbsolute = await getAllFiles(src, { filter: skipHiddenFilesFilter })
const filepathsRelative = filepathsAbsolute.map(fPath => path.relative(srcReal, fPath))
// Deepest entries first: a directory is always a string prefix of its children
// and therefore sorts before them, so reversing the sorted list guarantees
// that children are renamed while their parent still has its original name.
filepathsRelative.sort().reverse()

for (const fPath of filepathsRelative) {
	// Only the last path component is renamed here; the parent components are
	// handled when their own entries come up later in the loop.
	const name = path.basename(fPath)
	const normalizedName = name.normalize("NFC")
	if (name === normalizedName) continue

	const mvSrc = path.join(srcReal, fPath)
	const mvDst = path.join(srcReal, path.dirname(fPath), normalizedName)
	Deno.renameSync(mvSrc, mvDst)
	console.log(`Normalized: ${fPath.normalize("NFC")}`)
}

/**
 * Collects every entry found below `currentPath` that passes `o.filter`
 * and returns each one passed through `Deno.realPathSync`.
 *
 * @param currentPath Directory to walk.
 * @param o.filter Predicate deciding whether a directory entry is kept.
 * @returns The resolved paths, in the order the walk produced them.
 */
async function getAllFiles(
  currentPath: string,
  o: { filter: (e: Deno.DirEntry) => boolean }
) {
  const resolved: string[] = [];
  for (const found of await _getAllFiles(currentPath, o)) {
    resolved.push(Deno.realPathSync(found));
  }
  return resolved;
}

/**
 * Directory-entry filter that rejects names starting with a dot.
 *
 * Written as a function declaration (hoisted) rather than a `const` arrow so
 * that top-level code earlier in the module can reference it without hitting
 * the temporal dead zone.
 *
 * @param e Directory entry to test.
 * @returns `true` when the entry name does not start with ".".
 */
function skipHiddenFilesFilter(e: Deno.DirEntry): boolean {
  return !e.name.startsWith(".")
}
/**
 * Recursively walks `currentPath` and returns the path of every entry that
 * passes `o.filter`. Entries rejected by the filter are skipped entirely, so
 * a rejected directory is not descended into.
 *
 * Paths are built as `${currentPath}/${name}` and are returned as-is (not
 * resolved); each directory's own path comes before the paths of its contents.
 *
 * @param currentPath Directory to read.
 * @param o.filter Predicate deciding whether a directory entry is kept.
 * @returns The collected entry paths.
 */
async function _getAllFiles(
  currentPath: string,
  o: { filter: (e: Deno.DirEntry) => boolean }
): Promise<string[]> {
  const names: string[] = [];

  for await (const dirEntry of Deno.readDir(currentPath)) {
    if (!o.filter(dirEntry)) continue;
    const entryPath = `${currentPath}/${dirEntry.name}`;
    names.push(entryPath);
    if (dirEntry.isDirectory) {
      // Recurse into this helper directly so every returned path has the same
      // unresolved form and no path is resolved once per directory level.
      const nested = await _getAllFiles(entryPath, o);
      // Plain loop instead of push(...nested): spreading a very large array
      // can exceed the engine's argument-count limit.
      for (const nestedPath of nested) {
        names.push(nestedPath);
      }
    }
  }

  return names;
}